readr approach# readr produces a tibble
# which then shows in the printout the number of rows, cols and types of the cols
# we gained some valuable information when we did that
bike = readr::read_csv('https://raw.githubusercontent.com/fmegahed/isa401/main/data/bike_sharing_data.csv')
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 17379 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): datetime, sources
## dbl (11): season, holiday, workingday, weather, temp, atemp, humidity, winds...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
base R approachbike_base = read.csv('https://raw.githubusercontent.com/fmegahed/isa401/main/data/bike_sharing_data.csv')
dplyr::glimpse(bike_base) # if you were to use the base r read.csv; you will also need to fix humidity
## Rows: 17,379
## Columns: 13
## $ datetime <chr> "1/1/2011 0:00", "1/1/2011 1:00", "1/1/2011 2:00", "1/1/201…
## $ season <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ holiday <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ workingday <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ weather <int> 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3,…
## $ temp <dbl> 9.84, 9.02, 9.02, 9.84, 9.84, 9.84, 9.02, 8.20, 9.84, 13.12…
## $ atemp <dbl> 14.395, 13.635, 13.635, 14.395, 14.395, 12.880, 13.635, 12.…
## $ humidity <chr> "81", "80", "80", "75", "75", "75", "80", "86", "75", "76",…
## $ windspeed <dbl> 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 6.0032, 0.0000, 0.0…
## $ casual <int> 3, 8, 5, 3, 0, 0, 2, 1, 1, 8, 12, 26, 29, 47, 35, 40, 41, 1…
## $ registered <int> 13, 32, 27, 10, 1, 1, 0, 2, 7, 6, 24, 30, 55, 47, 71, 70, 5…
## $ count <int> 16, 40, 32, 13, 1, 1, 2, 3, 8, 14, 36, 56, 84, 94, 106, 110…
## $ sources <chr> "ad campaign", "www.yahoo.com", "www.google.fi", "AD campai…
dplyr::glimpse() or str()dplyr::glimpse(bike) # type of each variable, first few observations
## Rows: 17,379
## Columns: 13
## $ datetime <chr> "1/1/2011 0:00", "1/1/2011 1:00", "1/1/2011 2:00", "1/1/201…
## $ season <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ holiday <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ workingday <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ weather <dbl> 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3,…
## $ temp <dbl> 9.84, 9.02, 9.02, 9.84, 9.84, 9.84, 9.02, 8.20, 9.84, 13.12…
## $ atemp <dbl> 14.395, 13.635, 13.635, 14.395, 14.395, 12.880, 13.635, 12.…
## $ humidity <dbl> 81, 80, 80, 75, 75, 75, 80, 86, 75, 76, 76, 81, 77, 72, 72,…
## $ windspeed <dbl> 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 6.0032, 0.0000, 0.0…
## $ casual <dbl> 3, 8, 5, 3, 0, 0, 2, 1, 1, 8, 12, 26, 29, 47, 35, 40, 41, 1…
## $ registered <dbl> 13, 32, 27, 10, 1, 1, 0, 2, 7, 6, 24, 30, 55, 47, 71, 70, 5…
## $ count <dbl> 16, 40, 32, 13, 1, 1, 2, 3, 8, 14, 36, 56, 84, 94, 106, 110…
## $ sources <chr> "ad campaign", "www.yahoo.com", "www.google.fi", "AD campai…
Yes, therefore we will not change the column names.
The classes for some of the columns will need to change. So let us change them.
# datetime: it is currently a chr vector and we will change it accordingly
# this will convert the column into a datetime column and we knew the appropriate
# function from lubridate by seeing the values in that column and going down the rows
# until we saw that the second number had 13 and 14 in it which means that it cannot
# be a month
bike$datetime = lubridate::mdy_hm(bike$datetime)
# we will now change the columns season to weather to factor
# we could have done this in a way similar to line 53, but we will try to be
# "lazy" and do it in one/two steps (instead of four)
bike = bike |>
# converted the columns season:weather into chr
dplyr::mutate(dplyr::across(.cols = season:weather, .fn = as.character) ) |>
# converted them into a factor (could have also used forcats::as_factor instead)
dplyr::mutate(dplyr::across(.cols = season:weather, .fn = as.factor) )
dplyr::glimpse(bike)
## Rows: 17,379
## Columns: 13
## $ datetime <dttm> 2011-01-01 00:00:00, 2011-01-01 01:00:00, 2011-01-01 02:00…
## $ season <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ holiday <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ workingday <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ weather <fct> 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3,…
## $ temp <dbl> 9.84, 9.02, 9.02, 9.84, 9.84, 9.84, 9.02, 8.20, 9.84, 13.12…
## $ atemp <dbl> 14.395, 13.635, 13.635, 14.395, 14.395, 12.880, 13.635, 12.…
## $ humidity <dbl> 81, 80, 80, 75, 75, 75, 80, 86, 75, 76, 76, 81, 77, 72, 72,…
## $ windspeed <dbl> 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 6.0032, 0.0000, 0.0…
## $ casual <dbl> 3, 8, 5, 3, 0, 0, 2, 1, 1, 8, 12, 26, 29, 47, 35, 40, 41, 1…
## $ registered <dbl> 13, 32, 27, 10, 1, 1, 0, 2, 7, 6, 24, 30, 55, 47, 71, 70, 5…
## $ count <dbl> 16, 40, 32, 13, 1, 1, 2, 3, 8, 14, 36, 56, 84, 94, 106, 110…
## $ sources <chr> "ad campaign", "www.yahoo.com", "www.google.fi", "AD campai…
Our dataset now has reasonable column names (we did not change that) and correct column types. Therefore, our data is now technically correct.
pointblank packagelibrary(pointblank) # we will load it since we will be using a bunch of funtions from it
# this will save some time pointblank::fun_name
# rereading the data since we already fixed bike
bike_tbl = readr::read_csv('https://raw.githubusercontent.com/fmegahed/isa401/main/data/bike_sharing_data.csv')
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 17379 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): datetime, sources
## dbl (11): season, holiday, workingday, weather, temp, atemp, humidity, winds...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# decimal means that it is the pct of rows, while a number means number of rows
act = action_levels(warn_at = 0.01, notify_at = 0.01, stop_at = 0.01)
act # we do not need to set them all the same (it is done here for the example)
## -- The `action_levels` settings
## WARN failure threshold of 0.01 of all test units.
## STOP failure threshold of 0.01 of all test units.
## NOTIFY failure threshold of 0.01 of all test units.
## ----
agent =
# create agent starts the table of checks
create_agent(tbl = bike_tbl, actions = act) |>
# we will populate the rows without doing the checks (checks are done in the interrogation step)
col_is_date(columns = datetime) |>
# given that the columns were consecutive I can use col_start:col_end
# otherwise you need to use the function, see col_if_... for details
col_is_factor(columns = season:weather) |>
# we do not really care if they are dbl vs int but I am showing you that
# if you did care, we can check for that
col_is_numeric(columns = temp:windspeed) |>
col_is_integer(columns = casual:count) |>
col_is_character(columns = vars(sources))
res = interrogate(agent = agent, sample_limit = 5000)
res
| Pointblank Validation | |||||||||||||
| [2023-10-03|09:52:31]
tibble
bike_tblWARN
0.01
STOP
0.01
NOTIFY
0.01
|
|||||||||||||
| STEP | COLUMNS | VALUES | TBL | EVAL | UNITS | PASS | FAIL | W | S | N | EXT | ||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | col_is_date()
|
— |
|
✓ |
1 |
00.00 |
11.00 |
● |
● |
● |
— | ||
| 2 | col_is_factor()
|
— |
|
✓ |
1 |
00.00 |
11.00 |
● |
● |
● |
— | ||
| 3 | col_is_factor()
|
— |
|
✓ |
1 |
00.00 |
11.00 |
● |
● |
● |
— | ||
| 4 | col_is_factor()
|
— |
|
✓ |
1 |
00.00 |
11.00 |
● |
● |
● |
— | ||
| 5 | col_is_factor()
|
— |
|
✓ |
1 |
00.00 |
11.00 |
● |
● |
● |
— | ||
| 6 | col_is_numeric()
|
— |
|
✓ |
1 |
11.00 |
00.00 |
○ |
○ |
○ |
— | ||
| 7 | col_is_numeric()
|
— |
|
✓ |
1 |
11.00 |
00.00 |
○ |
○ |
○ |
— | ||
| 8 | col_is_numeric()
|
— |
|
✓ |
1 |
11.00 |
00.00 |
○ |
○ |
○ |
— | ||
| 9 | col_is_numeric()
|
— |
|
✓ |
1 |
11.00 |
00.00 |
○ |
○ |
○ |
— | ||
| 10 | col_is_integer()
|
— |
|
✓ |
1 |
00.00 |
11.00 |
● |
● |
● |
— | ||
| 11 | col_is_integer()
|
— |
|
✓ |
1 |
00.00 |
11.00 |
● |
● |
● |
— | ||
| 12 | col_is_integer()
|
— |
|
✓ |
1 |
00.00 |
11.00 |
● |
● |
● |
— | ||
| 13 | col_is_character()
|
— |
|
✓ |
1 |
11.00 |
00.00 |
○ |
○ |
○ |
— | ||
| 2023-10-03 09:52:31 EDT < 1 s 2023-10-03 09:52:31 EDT | |||||||||||||
Similar to approach 1 in the manual_fixing code chunk,
so we will not show it in class.